Link to the Kaggle page where we found our data set.
# Linear regression model specification: ordinary least squares
# fit with the base-R "lm" engine.
lm_spec <-
  linear_reg() %>%
  set_mode('regression') %>%
  set_engine(engine = 'lm')
# Recipe: model median_house_value from all other columns in `housing`,
# standardizing every numeric predictor (mean 0, sd 1).
lm_rec <- recipe(median_house_value ~ ., data = housing) %>%
step_normalize(all_numeric_predictors())
# Workflow bundling the normalization recipe with the OLS model spec.
# (The order of add_model()/add_recipe() does not matter to workflows.)
lm_wf <- workflow() %>%
  add_model(lm_spec) %>%
  add_recipe(lm_rec)
# Workflow 1: all predictors.
# BUG FIX: the original piped step_*() calls onto the *workflow* object,
# which errors — recipe steps belong in a recipe, so build one and add it.
housing_lm_rec_1 <- recipe(median_house_value ~ ., data = housing) %>%
  step_normalize(all_numeric_predictors()) %>% # standardize numeric predictors
  step_dummy(all_nominal_predictors()) %>%     # indicator-code categoricals
  step_nzv(all_predictors())                   # drop near-zero-variance predictors
housing_lm_wf_1 <- workflow() %>%
  add_recipe(housing_lm_rec_1) %>%
  add_model(lm_spec)
# Workflow 2: median_income only.
# BUG FIX: a workflow cannot take both add_formula() and step_*() calls
# piped onto it — express the formula as a recipe instead.
housing_lm_rec_2 <- recipe(median_house_value ~ median_income, data = housing) %>%
  step_normalize(all_numeric_predictors()) %>%
  step_dummy(all_nominal_predictors()) %>%
  step_nzv(all_predictors())
housing_lm_wf_2 <- workflow() %>%
  add_recipe(housing_lm_rec_2) %>%
  add_model(lm_spec)
# Workflow 3: median_income + longitude.
# BUG FIX: step_*() calls moved off the workflow and into a recipe
# (piping recipe steps onto a workflow object errors at runtime).
housing_lm_rec_3 <- recipe(median_house_value ~ median_income + longitude,
                           data = housing) %>%
  step_normalize(all_numeric_predictors()) %>%
  step_dummy(all_nominal_predictors()) %>%
  step_nzv(all_predictors())
housing_lm_wf_3 <- workflow() %>%
  add_recipe(housing_lm_rec_3) %>%
  add_model(lm_spec)
# Workflow 4: median_income + longitude + latitude.
# BUG FIX: step_*() calls moved off the workflow and into a recipe.
housing_lm_rec_4 <- recipe(median_house_value ~ median_income + longitude + latitude,
                           data = housing) %>%
  step_normalize(all_numeric_predictors()) %>%
  step_dummy(all_nominal_predictors()) %>%
  step_nzv(all_predictors())
housing_lm_wf_4 <- workflow() %>%
  add_recipe(housing_lm_rec_4) %>%
  add_model(lm_spec)
# Workflow 5: adds total_rooms.
# BUG FIX: step_*() calls moved off the workflow and into a recipe.
housing_lm_rec_5 <- recipe(
  median_house_value ~ median_income + longitude + latitude + total_rooms,
  data = housing) %>%
  step_normalize(all_numeric_predictors()) %>%
  step_dummy(all_nominal_predictors()) %>%
  step_nzv(all_predictors())
housing_lm_wf_5 <- workflow() %>%
  add_recipe(housing_lm_rec_5) %>%
  add_model(lm_spec)
# Workflow 6: adds population.
# BUG FIX: step_*() calls moved off the workflow and into a recipe.
housing_lm_rec_6 <- recipe(
  median_house_value ~ median_income + longitude + latitude + total_rooms + population,
  data = housing) %>%
  step_normalize(all_numeric_predictors()) %>%
  step_dummy(all_nominal_predictors()) %>%
  step_nzv(all_predictors())
housing_lm_wf_6 <- workflow() %>%
  add_recipe(housing_lm_rec_6) %>%
  add_model(lm_spec)
# Fit each candidate OLS model on the full housing data (no resampling
# yet — CV comparisons come later). mod1 uses every predictor; mod2-mod6
# grow from median_income alone to five predictors.
mod1 <- fit(lm_spec,
median_house_value ~ .,
data = housing)
mod2 <- fit(lm_spec,
median_house_value ~ median_income,
data = housing)
mod3 <- fit(lm_spec,
median_house_value ~ median_income + longitude,
data = housing)
mod4 <- fit(lm_spec,
median_house_value ~ median_income + longitude + latitude,
data = housing)
mod5 <- fit(lm_spec,
median_house_value ~ median_income + longitude + latitude + total_rooms,
data = housing)
mod6 <- fit(lm_spec,
median_house_value ~ median_income + longitude + latitude + total_rooms + population,
data = housing)
# Coefficient plot for the full model: point estimate with an
# approximate 95% CI (estimate +/- 1.96 * std.error); the intercept is
# dropped via slice(-1) so it does not dominate the axis.
mod1 %>%
tidy() %>%
slice(-1) %>%
mutate(lower = estimate - 1.96*std.error, upper = estimate + 1.96*std.error) %>%
ggplot() +
geom_vline(xintercept=0, linetype=4) +
geom_point(aes(x=estimate, y=term)) +
geom_segment(aes(y=term, yend=term, x=lower, xend=upper),
arrow = arrow(angle=90, ends='both', length = unit(0.1, 'cm'))) +
labs(x = 'Coefficient estimate (95% CI)', y = 'Feature') +
theme_classic()
# Coefficient table for the full model (recorded output below).
mod1 %>%
tidy()
## # A tibble: 13 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) -2273908. 88456. -25.7 1.88e-143
## 2 longitude -26813. 1020. -26.3 6.60e-150
## 3 latitude -25482. 1005. -25.4 9.39e-140
## 4 housing_median_age 1073. 43.9 24.4 4.85e-130
## 5 total_rooms -6.19 0.791 -7.83 5.32e- 15
## 6 total_bedrooms 101. 6.87 14.6 2.73e- 48
## 7 population -38.0 1.08 -35.3 9.35e-265
## 8 households 49.6 7.45 6.66 2.83e- 11
## 9 median_income 39260. 338. 116. 0
## 10 ocean_proximity2 3954. 1913. 2.07 3.88e- 2
## 11 ocean_proximity3 8232. 2176. 3.78 1.55e- 4
## 12 ocean_proximity4 156856. 30778. 5.10 3.49e- 7
## 13 ocean_proximity5 -35330. 2336. -15.1 2.23e- 51
# Helper: attach a model's predictions to the housing data and compute
# residuals (observed - predicted). Returns `data` plus .pred and resid.
# (Replaces six identical copy-pasted predict/bind_cols/mutate pipelines.)
add_preds_and_resids <- function(model, data) {
  model %>%
    predict(new_data = data) %>%
    bind_cols(data) %>%
    mutate(resid = median_house_value - .pred)
}
mod1_output <- add_preds_and_resids(mod1, housing)
mod2_output <- add_preds_and_resids(mod2, housing)
mod3_output <- add_preds_and_resids(mod3, housing)
mod4_output <- add_preds_and_resids(mod4, housing)
mod5_output <- add_preds_and_resids(mod5, housing)
mod6_output <- add_preds_and_resids(mod6, housing)
# In-sample (training) MAE for each model. These are optimistic relative
# to the cross-validated estimates computed later in the file.
mod1_output %>%
mae(truth = median_house_value, estimate = .pred)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 mae standard 49748.
mod2_output %>%
mae(truth = median_house_value, estimate = .pred)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 mae standard 62598.
mod3_output %>%
mae(truth = median_house_value, estimate = .pred)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 mae standard 62486.
mod4_output %>%
mae(truth = median_house_value, estimate = .pred)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 mae standard 54807.
mod5_output %>%
mae(truth = median_house_value, estimate = .pred)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 mae standard 54780.
mod6_output %>%
mae(truth = median_house_value, estimate = .pred)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 mae standard 53892.
# Helper: residual-vs-predictor diagnostic plot. `xvar` is an unquoted
# column name from `model_output` (tidy evaluation via {{ }}).
# (Replaces seventeen identical copy-pasted ggplot pipelines.)
plot_resid <- function(model_output, xvar) {
  ggplot(model_output, aes(y = resid, x = {{ xvar }})) +
    geom_point() +
    geom_smooth() +
    geom_hline(yintercept = 0, color = "red") +
    theme_classic()
}
plot_resid(mod1_output, median_house_value)
#not very useful?
plot_resid(mod1_output, housing_median_age)
#not very useful
plot_resid(mod1_output, ocean_proximity)
plot_resid(mod1_output, longitude)
plot_resid(mod1_output, latitude)
plot_resid(mod1_output, total_rooms)
plot_resid(mod1_output, median_income)
plot_resid(mod1_output, total_bedrooms)
plot_resid(mod2_output, total_rooms)
plot_resid(mod2_output, median_house_value)
plot_resid(mod2_output, longitude)
plot_resid(mod2_output, latitude)
plot_resid(mod2_output, median_income)
plot_resid(mod2_output, population)
plot_resid(mod3_output, population)
plot_resid(mod3_output, total_rooms)
plot_resid(mod6_output, longitude)
# 10-fold cross-validation of each workflow, collecting RMSE, R-squared,
# and MAE per fold.
# NOTE(review): data_cv10 is created much later in this file
# (vfold_cv near the end) — confirm it is defined before this chunk runs.
mod1_cv <- fit_resamples(housing_lm_wf_1,
resamples = data_cv10,
metrics = metric_set(rmse, rsq, mae))
mod2_cv <- fit_resamples(housing_lm_wf_2,
resamples = data_cv10,
metrics = metric_set(rmse, rsq, mae))
mod3_cv <- fit_resamples(housing_lm_wf_3,
resamples = data_cv10,
metrics = metric_set(rmse, rsq, mae))
mod4_cv <- fit_resamples(housing_lm_wf_4,
resamples = data_cv10,
metrics = metric_set(rmse, rsq, mae))
mod5_cv <- fit_resamples(housing_lm_wf_5,
resamples = data_cv10,
metrics = metric_set(rmse, rsq, mae))
mod6_cv <- fit_resamples(housing_lm_wf_6,
resamples = data_cv10,
metrics = metric_set(rmse, rsq, mae))
# Average the per-fold RMSE by hand (unnesting .metrics). The
# collect_metrics() calls further down report the same means along with
# standard errors.
mod1_cv %>% unnest(.metrics) %>%
filter(.metric == 'rmse') %>%
summarize(RMSE_CV = mean(.estimate))
## # A tibble: 1 × 1
## RMSE_CV
## <dbl>
## 1 68711.
mod2_cv %>% unnest(.metrics) %>%
filter(.metric == 'rmse') %>%
summarize(RMSE_CV = mean(.estimate))
## # A tibble: 1 × 1
## RMSE_CV
## <dbl>
## 1 83701.
mod3_cv %>% unnest(.metrics) %>%
filter(.metric == 'rmse') %>%
summarize(RMSE_CV = mean(.estimate))
## # A tibble: 1 × 1
## RMSE_CV
## <dbl>
## 1 83610.
mod4_cv %>% unnest(.metrics) %>%
filter(.metric == 'rmse') %>%
summarize(RMSE_CV = mean(.estimate))
## # A tibble: 1 × 1
## RMSE_CV
## <dbl>
## 1 74421.
mod5_cv %>% unnest(.metrics) %>%
filter(.metric == 'rmse') %>%
summarize(RMSE_CV = mean(.estimate))
## # A tibble: 1 × 1
## RMSE_CV
## <dbl>
## 1 74386.
mod6_cv %>% unnest(.metrics) %>%
filter(.metric == 'rmse') %>%
summarize(RMSE_CV = mean(.estimate))
## # A tibble: 1 × 1
## RMSE_CV
## <dbl>
## 1 73070.
# Summarized CV metrics for each model: mean over the 10 folds plus a
# standard error for each metric.
mod1_cv %>% collect_metrics()
## # A tibble: 3 × 6
## .metric .estimator mean n std_err .config
## <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 mae standard 49798. 10 435. Preprocessor1_Model1
## 2 rmse standard 68711. 10 798. Preprocessor1_Model1
## 3 rsq standard 0.646 10 0.00588 Preprocessor1_Model1
mod2_cv %>% collect_metrics()
## # A tibble: 3 × 6
## .metric .estimator mean n std_err .config
## <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 mae standard 62609. 10 532. Preprocessor1_Model1
## 2 rmse standard 83701. 10 888. Preprocessor1_Model1
## 3 rsq standard 0.474 10 0.00782 Preprocessor1_Model1
mod3_cv %>% collect_metrics()
## # A tibble: 3 × 6
## .metric .estimator mean n std_err .config
## <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 mae standard 62503. 10 553. Preprocessor1_Model1
## 2 rmse standard 83610. 10 898. Preprocessor1_Model1
## 3 rsq standard 0.475 10 0.00798 Preprocessor1_Model1
mod4_cv %>% collect_metrics()
## # A tibble: 3 × 6
## .metric .estimator mean n std_err .config
## <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 mae standard 54826. 10 478. Preprocessor1_Model1
## 2 rmse standard 74421. 10 791. Preprocessor1_Model1
## 3 rsq standard 0.584 10 0.00545 Preprocessor1_Model1
mod5_cv %>% collect_metrics()
## # A tibble: 3 × 6
## .metric .estimator mean n std_err .config
## <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 mae standard 54802. 10 474. Preprocessor1_Model1
## 2 rmse standard 74386. 10 792. Preprocessor1_Model1
## 3 rsq standard 0.585 10 0.00552 Preprocessor1_Model1
mod6_cv %>% collect_metrics()
## # A tibble: 3 × 6
## .metric .estimator mean n std_err .config
## <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 mae standard 53916. 10 442. Preprocessor1_Model1
## 2 rmse standard 73070. 10 799. Preprocessor1_Model1
## 3 rsq standard 0.599 10 0.00578 Preprocessor1_Model1
# LASSO model specification: glmnet engine; mixture = 1 selects a pure
# L1 (lasso) penalty, and the penalty amount is left to be tuned by CV.
lasso_spec <-
linear_reg() %>%
set_args(mixture = 1, penalty = tune()) %>% ## mixture = 1 indicates Lasso, we'll choose penalty later
set_engine(engine = 'glmnet') %>% #note we are using a different engine
set_mode('regression')
# Preprocessing recipe for LASSO: predictors must be standardized so the
# penalty shrinks them on a comparable scale.
lasso_rec <- recipe(median_house_value ~ ., data = housing) %>%
step_nzv(all_predictors()) %>% # removes near-zero-variance (nearly constant) predictors
#step_novel(all_nominal_predictors()) %>% # important if you have rare categorical variables
step_normalize(all_numeric_predictors()) %>% # important standardization step for LASSO
step_dummy(all_nominal_predictors()) # creates indicator variables for categorical variables
# Bundle the LASSO recipe with the tunable glmnet model spec.
lasso_wf <- workflow() %>%
  add_model(lasso_spec) %>%
  add_recipe(lasso_rec)
# Tune LASSO #1: 30 candidate penalties, evenly spaced on the log10 scale.
penalty_grid <- grid_regular(
penalty(range = c(-5,5)), #log10 transformed: 10^-5 to 10^5
levels = 30)
# Tune LASSO #2: evaluate every candidate penalty with 10-fold CV.
tune_res <- tune_grid( # new function for tuning parameters
lasso_wf, # workflow
resamples = data_cv10, # cv folds
metrics = metric_set(rmse, mae),
grid = penalty_grid)
# LASSO model fit on the full data, used to visualize coefficient paths.
# NOTE(review): lasso_wf still carries penalty = tune(); glmnet fits the
# whole path so the path plot is meaningful, but confirm fit() accepts the
# unfinalized workflow in your tidymodels version.
lasso_fit <- lasso_wf %>%
fit(data = housing) # Fit to data
# plotting LASSO coefficient paths against lambda (log scale)
plot(lasso_fit %>%
extract_fit_parsnip() %>%
pluck('fit'), # way to get the original glmnet output
xvar = "lambda")
# Visualize LASSO Metrics from Tuning (CV metric vs penalty).
autoplot(tune_res) + theme_classic()
# Summarize LASSO CV Metrics: mean CV MAE at each candidate penalty.
collect_metrics(tune_res) %>%
filter(.metric == 'mae') %>% # MAE chosen here; could filter 'rmse' instead
select(penalty, mae = mean)
## # A tibble: 30 × 2
## penalty mae
## <dbl> <dbl>
## 1 0.00001 49791.
## 2 0.0000221 49791.
## 3 0.0000489 49791.
## 4 0.000108 49791.
## 5 0.000240 49791.
## 6 0.000530 49791.
## 7 0.00117 49791.
## 8 0.00259 49791.
## 9 0.00574 49791.
## 10 0.0127 49791.
## # … with 20 more rows
# choose penalty value based on lowest CV MAE
best_penalty <- select_best(tune_res, metric = 'mae')
best_penalty
## # A tibble: 1 × 2
## penalty .config
## <dbl> <chr>
## 1 78.8 Preprocessor1_Model21
# choose the largest penalty within 1 SE of the lowest CV MAE
# (a sparser model that is statistically indistinguishable from the best)
best_se_penalty <- select_by_one_std_err(tune_res, metric = 'mae', desc(penalty))
# Fit Final LASSO Models: lock the chosen penalty into each workflow,
# then refit on the full data.
final_wf <- finalize_workflow(lasso_wf, best_penalty) # incorporates penalty value to workflow
final_wf_se <- finalize_workflow(lasso_wf, best_se_penalty) # incorporates penalty value to workflow
final_fit <- fit(final_wf, data = housing)
final_fit_se <- fit(final_wf_se, data = housing)
# Coefficients at the lowest-MAE penalty (78.8): nothing shrunk to zero.
tidy(final_fit)
## # A tibble: 13 × 3
## term estimate penalty
## <chr> <dbl> <dbl>
## 1 (Intercept) 216740. 78.8
## 2 longitude -50809. 78.8
## 3 latitude -51528. 78.8
## 4 housing_median_age 13453. 78.8
## 5 total_rooms -11697. 78.8
## 6 total_bedrooms 40214. 78.8
## 7 population -42467. 78.8
## 8 households 18813. 78.8
## 9 median_income 74149. 78.8
## 10 ocean_proximity_X2 2872. 78.8
## 11 ocean_proximity_X3 7510. 78.8
## 12 ocean_proximity_X4 152032. 78.8
## 13 ocean_proximity_X5 -38212. 78.8
# Coefficients at the 1-SE penalty (853): total_rooms and
# ocean_proximity_X2 are shrunk exactly to zero (sparser model).
tidy(final_fit_se)
## # A tibble: 13 × 3
## term estimate penalty
## <chr> <dbl> <dbl>
## 1 (Intercept) 223724. 853.
## 2 longitude -28950. 853.
## 3 latitude -28678. 853.
## 4 housing_median_age 13219. 853.
## 5 total_rooms 0 853.
## 6 total_bedrooms 26949. 853.
## 7 population -36168. 853.
## 8 households 14685. 853.
## 9 median_income 71425. 853.
## 10 ocean_proximity_X2 0 853.
## 11 ocean_proximity_X3 6385. 853.
## 12 ocean_proximity_X4 109045. 853.
## 13 ocean_proximity_X5 -55699. 853.
# Residuals-vs-fitted diagnostic for the 1-SE LASSO model.
lasso_mod_out <- final_fit_se %>%
predict(new_data = housing) %>%
bind_cols(housing) %>%
mutate(resid = median_house_value - .pred)
lasso_mod_out %>%
ggplot(aes(x = .pred, y = resid)) +
geom_point() +
geom_smooth(se = FALSE) +
geom_hline(yintercept = 0) +
theme_classic()
\(~\)
C. Compare estimated test performance across the models. Which model(s) might you prefer?
\(~\)
D. Use residual plots to evaluate whether some quantitative predictors might be better modeled with nonlinear relationships.
\(~\)
E. Which variables do you think are the most important predictors of your quantitative outcome? Justify your answer. Do the methods you’ve applied reach consensus on which variables are most important? What insights are expected? Surprising?
Decide on an overall best model based on your investigations so far. To do this, make clear your analysis goals. Predictive accuracy? Interpretability? A combination of both?
The model that contains all of the variables in the data set is the best model so far. Because that model yielded the lowest CV test error rate, it seems that taking into account house age, number of rooms, location, ocean proximity, total bedrooms, population, and the income level of a block of houses is important for determining house value.
The interpretability of our model is pretty straightforward, though we would like to find out whether some individual variables could have more influence on house value and make predictions with those variables.
\(~\) \(~\)
Are there any harms that may come from your analyses and/or how the data were collected? What cautions do you want to keep in mind when communicating your work?
The variable median_income measures the median
income for households within a block of houses. Since this variable
seems to be the most persistent in our model, according to our LASSO
results, this confirms the seriously large wage gap in the United
States. Income affects one’s social status, health care access, and even
house value. We could further infer from our preliminary analysis that
income affects house value because it is more likely that those who have
a higher income are not criminals. Where criminal activity is low,
housing options there attract more buyers — especially buyers of higher
social status.
The data was derived from Aurélien Géron’s book ‘Hands-On Machine Learning with Scikit-Learn and TensorFlow’. It contains the 1990 California census information, specifically on housing. We don’t suspect that the way in which the data was collected is harmful. But we do need to acknowledge that those who did respond to census letters are probably those who have a mailing address and who belong to middle- or higher-income groups. Census surveys were not delivered electronically back then, so data collection must have missed people who did not have a mailing address or P.O. box. Additionally, people in middle- or higher-income groups would be more likely to respond to the census because they can read and write English, they might be more informed about the purpose of a census survey, and they might simply have more time to attend to a list of questions printed on multiple pages of paper. People with lower incomes probably had to work many more hours just to make ends meet, and the census letter could have been the last thing on their minds. Keep in mind, too, that the data set is specifically on California housing — California being, arguably, one of the wealthier and more liberal states in the US.
Use step_ns() for each quantitative predictor that you want to allow to be non-linear. To choose the degrees of freedom (deg_free), I recommend fitting a smoothing spline and using the edf to inform your choice.
# Linear Regression Model Spec
# Linear regression model specification (re-declared for this section):
# ordinary least squares via the "lm" engine.
lm_spec <-
  linear_reg() %>%
  set_mode('regression') %>%
  set_engine(engine = 'lm')
# Base recipe with no preprocessing steps.
lm_rec <- recipe(median_house_value ~ ., data = housing)
# Natural-spline recipe: allow each of these quantitative predictors a
# nonlinear effect with 10 spline degrees of freedom.
# NOTE(review): median_income has no step_ns() here — confirm that
# leaving it linear is intentional.
ns_rec <- lm_rec %>%
step_ns(longitude, deg_free = 10) %>%
step_ns(latitude, deg_free = 10) %>%
step_ns(housing_median_age, deg_free = 10) %>%
step_ns(total_rooms, deg_free = 10) %>%
step_ns(total_bedrooms, deg_free = 10) %>%
step_ns(households, deg_free = 10) %>%
step_ns(population, deg_free = 10)
#degrees of freedom has to be a whole number and larger than edf
# Workflow pairing the natural-spline recipe with the OLS model spec.
ns_wf <- workflow() %>%
  add_recipe(ns_rec) %>%
  add_model(lm_spec)
# 10-fold cross-validation folds.
# NOTE(review): data_cv10 is also used by the CV chunks much earlier in
# this file — confirm this definition runs before those chunks.
data_cv10<- vfold_cv(housing, v = 10)
# CV estimate of MAE for the natural-spline model.
cv_output <- fit_resamples(
ns_wf,
resamples = data_cv10, # cv folds
metrics = metric_set(mae))
cv_output %>% collect_metrics()
## # A tibble: 1 × 6
## .metric .estimator mean n std_err .config
## <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 mae standard 44580. 10 222. Preprocessor1_Model1
# Fit with all data
# ns_mod <- fit(
# lm_spec, #workflow
# data = housing)
\(~\)
# GAM specification using the mgcv engine.
gam_spec <-
gen_additive_mod() %>%
set_engine(engine = 'mgcv') %>%
set_mode('regression')
# GAM fit: a smooth s() term for each quantitative predictor;
# ocean_proximity (categorical) enters as a parametric term.
gam_mod <- fit(gam_spec,
median_house_value ~ ocean_proximity + s(longitude) + s(latitude) +
s(housing_median_age) + s(total_rooms) + s(total_bedrooms) +
s(population) + s(households) + s(median_income),
data = housing)
\(~\)
# GAM diagnostics: 2x2 panel of gam.check() plots plus the basis
# dimension (k) checks printed below. Low p-values for longitude,
# latitude, and population suggest k may be too small for those smooths.
par(mfrow=c(2,2))
gam_mod %>% pluck('fit') %>% mgcv::gam.check()
##
## Method: GCV Optimizer: magic
## Smoothing parameter selection converged after 14 iterations.
## The RMS GCV score gradient at convergence was 269.7354 .
## The Hessian was positive definite.
## Model rank = 77 / 77
##
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
##
## k' edf k-index p-value
## s(longitude) 9.00 8.95 0.89 <2e-16 ***
## s(latitude) 9.00 8.90 0.90 <2e-16 ***
## s(housing_median_age) 9.00 8.86 1.01 0.700
## s(total_rooms) 9.00 7.00 1.00 0.385
## s(total_bedrooms) 9.00 9.00 1.01 0.875
## s(population) 9.00 8.17 0.94 <2e-16 ***
## s(households) 9.00 4.95 1.01 0.690
## s(median_income) 9.00 8.15 0.97 0.005 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# GAM summary: parametric coefficients for ocean_proximity and
# approximate significance (edf, F, p) of each smooth term.
gam_mod %>% pluck('fit') %>% summary()
##
## Family: gaussian
## Link function: identity
##
## Formula:
## median_house_value ~ ocean_proximity + s(longitude) + s(latitude) +
## s(housing_median_age) + s(total_rooms) + s(total_bedrooms) +
## s(population) + s(households) + s(median_income)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 211974 1968 107.712 < 2e-16 ***
## ocean_proximity2 -3028 2270 -1.334 0.182359
## ocean_proximity3 21981 2460 8.936 < 2e-16 ***
## ocean_proximity4 103433 27675 3.737 0.000186 ***
## ocean_proximity5 -20833 2664 -7.820 5.54e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(longitude) 8.950 8.999 130.752 <2e-16 ***
## s(latitude) 8.901 8.997 146.490 <2e-16 ***
## s(housing_median_age) 8.862 8.993 64.501 <2e-16 ***
## s(total_rooms) 7.004 8.188 8.413 <2e-16 ***
## s(total_bedrooms) 9.000 9.000 39.516 <2e-16 ***
## s(population) 8.174 8.754 333.367 <2e-16 ***
## s(households) 4.951 6.171 28.213 <2e-16 ***
## s(median_income) 8.151 8.804 1179.025 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.718 Deviance explained = 71.9%
## GCV = 3.7731e+09 Scale est. = 3.7604e+09 n = 20433
\(~\)
\(~\)
# Partial-effect plots for every smooth term, all on a single page.
# Fix: mgcv::plot.gam's argument is `pages`; the original `page = 1`
# worked only via partial argument matching — spell it out.
gam_mod %>% pluck('fit') %>% plot(pages = 1)
\(~\)
\(~\)
Below are the MAEs yielded from our models thus far: 1) OLS: 49797.67
2) LASSO: 49791.43
3) GAMs: 44580.23
Decide on an overall best model based on your investigations so far. To do this, make clear your analysis goals. Predictive accuracy? Interpretability? A combination of both?
Are there any harms that may come from your analyses and/or how the data were collected?
\(~\)
What cautions do you want to keep in mind when communicating your work?